/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 2.0
//              Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#include <Kokkos_Core.hpp>

namespace TestAtomic {

// Struct for testing arbitrary size atomics.
//
// SuperScalar< N > wraps N doubles and provides just enough value semantics
// (construction, assignment, addition, comparison) to be used as the element
// type of the atomic tests in this file.
//
// Conventions implemented below:
//  - Default construction zero-fills every component.
//  - Construction from a double `src` sets val[i] = ( i + 1 ) * src, so the
//    components are distinct multiples of the scalar.
//  - operator+=( double ) applies the same ( i + 1 ) scaling per component.
//  - Overloads taking or acting on `volatile` objects are provided in addition
//    to the regular ones. NOTE(review): presumably these exist because the
//    generic Kokkos atomic implementation accesses the target through
//    volatile pointers -- confirm against the Kokkos atomics headers.
//
// The comparison and addition operators are const-qualified so that they can
// be applied to const operands.

template< int N >
struct SuperScalar {
  double val[N];

  // Zero-initializes all N components.
  KOKKOS_INLINE_FUNCTION
  SuperScalar() {
    for ( int i = 0; i < N; i++ ) {
      val[i] = 0.0;
    }
  }

  // Component-wise copy.
  KOKKOS_INLINE_FUNCTION
  SuperScalar( const SuperScalar & src ) {
    for ( int i = 0; i < N; i++ ) {
      val[i] = src.val[i];
    }
  }

  // Component-wise copy from a volatile source.
  KOKKOS_INLINE_FUNCTION
  SuperScalar( const volatile SuperScalar & src ) {
    for ( int i = 0; i < N; i++ ) {
      val[i] = src.val[i];
    }
  }

  // Component-wise assignment.
  KOKKOS_INLINE_FUNCTION
  SuperScalar& operator=( const SuperScalar & src ) {
    for ( int i = 0; i < N; i++ ) {
      val[i] = src.val[i];
    }
    return *this;
  }

  // Component-wise assignment from a volatile source.
  KOKKOS_INLINE_FUNCTION
  SuperScalar& operator=( const volatile SuperScalar & src ) {
    for ( int i = 0; i < N; i++ ) {
      val[i] = src.val[i];
    }
    return *this;
  }

  // Component-wise assignment into a volatile destination. Returns void
  // rather than a reference to the volatile object.
  KOKKOS_INLINE_FUNCTION
  void operator=( const SuperScalar & src ) volatile  {
    for ( int i = 0; i < N; i++ ) {
      val[i] = src.val[i];
    }
  }

  // Returns the component-wise sum of *this and src; neither operand is
  // modified.
  KOKKOS_INLINE_FUNCTION
  SuperScalar operator+( const SuperScalar & src ) const {
    SuperScalar tmp = *this;
    for ( int i = 0; i < N; i++ ) {
      tmp.val[i] += src.val[i];
    }
    return tmp;
  }

  // Adds ( i + 1 ) * src to component i, matching the scaling used by the
  // constructor taking a double.
  KOKKOS_INLINE_FUNCTION
  SuperScalar& operator+=( const double & src ) {
    for ( int i = 0; i < N; i++ ) {
      val[i] += 1.0 * ( i + 1 ) * src;
    }
    return *this;
  }

  // Component-wise in-place addition.
  KOKKOS_INLINE_FUNCTION
  SuperScalar& operator+=( const SuperScalar & src ) {
    for ( int i = 0; i < N; i++ ) {
      val[i] += src.val[i];
    }
    return *this;
  }

  // True when every component compares equal (exact floating-point equality).
  KOKKOS_INLINE_FUNCTION
  bool operator==( const SuperScalar & src ) const {
    bool compare = true;
    for( int i = 0; i < N; i++ ) {
      compare = compare && ( val[i] == src.val[i] );
    }
    return compare;
  }

  // Logical negation of operator==; defined in terms of it so the two can
  // never disagree.
  KOKKOS_INLINE_FUNCTION
  bool operator!=( const SuperScalar & src ) const {
    return !( *this == src );
  }

  // Converting constructor: component i is set to ( i + 1 ) * src.
  // Intentionally not explicit; code in this file relies on the implicit
  // conversion (e.g. assigning or returning the literal 0 for a SuperScalar).
  KOKKOS_INLINE_FUNCTION
  SuperScalar( const double & src ) {
    for ( int i = 0; i < N; i++ ) {
      val[i] = 1.0 * ( i + 1 ) * src;
    }
  }
};

// Streams a SuperScalar as "{ v0, v1, ..., vN-1}": every component except the
// last is followed by ", ", and the last is followed directly by "}".
template< int N >
std::ostream & operator<<( std::ostream & os, const SuperScalar< N > & dt )
{
  os << "{ ";

  // Emit all components that need a trailing separator.
  int idx = 0;
  while ( idx + 1 < N ) {
    os << dt.val[idx] << ", ";
    ++idx;
  }

  // The final component closes the brace instead of adding a separator.
  return os << dt.val[N-1] << "}";
}

template< class T, class DEVICE_TYPE >
struct ZeroFunctor {
  typedef DEVICE_TYPE execution_space;
  typedef typename Kokkos::View< T, execution_space > type;
  typedef typename Kokkos::View< T, execution_space >::HostMirror h_type;

  type data;

  KOKKOS_INLINE_FUNCTION
  void operator()( int ) const {
    data() = 0;
  }
};

//---------------------------------------------------
//--------------atomic_fetch_add---------------------
//---------------------------------------------------

// parallel_for functor: each work item atomically adds T( 1 ) to the single
// value held by `data`.
template< class T, class DEVICE_TYPE >
struct AddFunctor {
  using execution_space = DEVICE_TYPE;
  using type            = Kokkos::View< T, execution_space >;

  type data;

  // The work index is unused; the fetched previous value is discarded.
  KOKKOS_INLINE_FUNCTION
  void operator()( int /* work_index */ ) const {
    Kokkos::atomic_fetch_add( &data(), T( 1 ) );
  }
};

// parallel_reduce flavour of AddFunctor: performs the same atomic increment
// of `data` and never touches the reduction argument.
template< class T, class DEVICE_TYPE >
struct AddFunctorReduce {
  using execution_space = DEVICE_TYPE;
  using type            = Kokkos::View< T, execution_space >;

  type data;

  // Both the work index and the reduction value are unused.
  KOKKOS_INLINE_FUNCTION
  void operator()( int /* work_index */, int& /* reduction */ ) const {
    Kokkos::atomic_fetch_add( &data(), T( 1 ) );
  }
};

template< class T, class execution_space >
T AddLoop( int loop ) {
  struct ZeroFunctor< T, execution_space > f_zero;
  typename ZeroFunctor< T, execution_space >::type data( "Data" );
  typename ZeroFunctor< T, execution_space >::h_type h_data( "HData" );

  f_zero.data = data;

  Kokkos::parallel_for( 1, f_zero );
  execution_space().fence();

  struct AddFunctor< T, execution_space > f_add;

  f_add.data = data;
  Kokkos::parallel_for( loop, f_add );
  execution_space().fence();

  Kokkos::deep_copy( h_data, data );
  T val = h_data();

  struct AddFunctorReduce< T, execution_space > f_add_red;
  f_add_red.data = data;
  int dummy_result;
  Kokkos::parallel_reduce( loop, f_add_red , dummy_result );
  execution_space().fence();

  return val;
}

// Serial reference for AddLoop: starting from zero, adds T( 1 ) `loop` times
// and returns the total.
//
// The accumulator lives in automatic storage; a heap allocation for a single
// scalar is unnecessary and would leak if an operation on T threw.
// It is default-constructed and then assigned 0 (rather than initialized from
// 0) so that only T's default constructor and assignment are required.
template< class T >
T AddLoopSerial( int loop ) {
  T total;
  total = 0;

  for ( int i = 0; i < loop; i++ ) {
    total += static_cast< T >( 1 );
  }

  return total;
}

//------------------------------------------------------
//--------------atomic_compare_exchange-----------------
//------------------------------------------------------

// parallel_for functor: each work item increments the single value held by
// `data` by T( 1 ) using a compare-and-exchange retry loop.
template< class T, class DEVICE_TYPE >
struct CASFunctor {
  using execution_space = DEVICE_TYPE;
  using type            = Kokkos::View< T, execution_space >;

  type data;

  KOKKOS_INLINE_FUNCTION
  void operator()( int /* work_index */ ) const {
    // Initial snapshot; refreshed from the exchange's return value on retry.
    T observed = data();
    T desired, expected;

    do {
      expected = observed;
      desired  = expected + T( 1 );
      // Retry whenever the value returned by the exchange differs from the
      // value the increment was computed from.
      observed = Kokkos::atomic_compare_exchange( &data(), expected, desired );
    } while ( observed != expected );
  }
};

// parallel_reduce flavour of CASFunctor: performs the same compare-and-exchange
// increment of `data` and never touches the reduction argument.
template< class T, class DEVICE_TYPE >
struct CASFunctorReduce {
  using execution_space = DEVICE_TYPE;
  using type            = Kokkos::View< T, execution_space >;

  type data;

  KOKKOS_INLINE_FUNCTION
  void operator()( int /* work_index */, int& /* reduction */ ) const {
    // Initial snapshot; refreshed from the exchange's return value on retry.
    T observed = data();
    T desired, expected;

    do {
      expected = observed;
      desired  = expected + T( 1 );
      // Retry whenever the value returned by the exchange differs from the
      // value the increment was computed from.
      observed = Kokkos::atomic_compare_exchange( &data(), expected, desired );
    } while ( observed != expected );
  }
};

template< class T, class execution_space >
T CASLoop( int loop ) {
  struct ZeroFunctor< T, execution_space > f_zero;
  typename ZeroFunctor< T, execution_space >::type data( "Data" );
  typename ZeroFunctor< T, execution_space >::h_type h_data( "HData" );

  f_zero.data = data;
  Kokkos::parallel_for( 1, f_zero );
  execution_space().fence();

  struct CASFunctor< T, execution_space > f_cas;
  f_cas.data = data;
  Kokkos::parallel_for( loop, f_cas );
  execution_space().fence();

  Kokkos::deep_copy( h_data, data );
  T val = h_data();

  struct CASFunctorReduce< T, execution_space > f_cas_red;
  f_cas_red.data = data;
  int dummy_result;
  Kokkos::parallel_reduce( loop, f_cas_red , dummy_result );
  execution_space().fence();

  return val;
}

// Serial reference for CASLoop: starting from zero, performs `loop`
// increments by T( 1 ), each written as an emulated compare-and-exchange,
// and returns the final value.
//
// The value lives in automatic storage; a heap allocation for a single scalar
// is unnecessary and would leak if an operation on T threw.
// It is default-constructed and then assigned 0 (rather than initialized from
// 0) so that only T's default constructor and assignment are required.
template< class T >
T CASLoopSerial( int loop ) {
  T value;
  value = 0;

  for ( int i = 0; i < loop; i++ ) {
    // Deliberately non-const: T's operator+ / operator== are not required to
    // be const-qualified.
    T assumed;
    T newval;
    T old;

    // Mirrors the shape of the device-side retry loop. Nothing else modifies
    // `value` between the two reads, so `old` is a copy of the same value as
    // `assumed`.
    do {
      assumed = value;
      newval = assumed + static_cast< T >( 1 );
      old = value;
      value = newval;
    } while( !( assumed == old ) );
  }

  return value;
}

//----------------------------------------------
//--------------atomic_exchange-----------------
//----------------------------------------------

// parallel_for functor for the atomic_exchange test.
//
// Work item i atomically swaps T( i ) into `data` and atomically adds the
// value it displaced to `data2`.
template< class T, class DEVICE_TYPE >
struct ExchFunctor {
  using execution_space = DEVICE_TYPE;
  using type            = Kokkos::View< T, execution_space >;

  type data, data2;

  KOKKOS_INLINE_FUNCTION
  void operator()( int i ) const {
    T displaced = Kokkos::atomic_exchange( &data(), T( i ) );
    Kokkos::atomic_fetch_add( &data2(), displaced );
  }
};

// parallel_reduce flavour of ExchFunctor: performs the same exchange into
// `data` and accumulation into `data2`, and never touches the reduction
// argument.
template< class T, class DEVICE_TYPE >
struct ExchFunctorReduce {
  using execution_space = DEVICE_TYPE;
  using type            = Kokkos::View< T, execution_space >;

  type data, data2;

  KOKKOS_INLINE_FUNCTION
  void operator()( int i, int& /* reduction */ ) const {
    T displaced = Kokkos::atomic_exchange( &data(), T( i ) );
    Kokkos::atomic_fetch_add( &data2(), displaced );
  }
};

template< class T, class execution_space >
T ExchLoop( int loop ) {
  struct ZeroFunctor< T, execution_space > f_zero;
  typename ZeroFunctor< T, execution_space >::type data( "Data" );
  typename ZeroFunctor< T, execution_space >::h_type h_data( "HData" );

  f_zero.data = data;
  Kokkos::parallel_for( 1, f_zero );
  execution_space().fence();

  typename ZeroFunctor< T, execution_space >::type data2( "Data" );
  typename ZeroFunctor< T, execution_space >::h_type h_data2( "HData" );

  f_zero.data = data2;
  Kokkos::parallel_for( 1, f_zero );
  execution_space().fence();

  struct ExchFunctor< T, execution_space > f_exch;
  f_exch.data = data;
  f_exch.data2 = data2;
  Kokkos::parallel_for( loop, f_exch );
  execution_space().fence();

  Kokkos::deep_copy( h_data, data );
  Kokkos::deep_copy( h_data2, data2 );
  T val = h_data() + h_data2();

  struct ExchFunctorReduce< T, execution_space > f_exch_red;
  f_exch_red.data = data;
  f_exch_red.data2 = data2;
  int dummy_result;
  Kokkos::parallel_reduce( loop, f_exch_red , dummy_result );
  execution_space().fence();

  return val;
}

// Serial reference for ExchLoop, for every T except Kokkos::complex<double>.
//
// Overload selection: when T is Kokkos::complex<double> the parameter type
// below resolves to `void`, which makes this overload non-viable so that the
// complex-specific overload is chosen instead.
//
// For i in [0, loop): the current value is replaced by T( i ) and the value
// it displaced is added to a running sum. Returns running sum + final value.
//
// Both values live in automatic storage; heap allocations for single scalars
// are unnecessary and would leak if an operation on T threw. They are
// default-constructed and then assigned 0 (rather than initialized from 0) so
// that only T's default constructor and assignment are required.
template< class T >
T ExchLoopSerial( typename std::conditional< !std::is_same< T, Kokkos::complex<double> >::value, int, void >::type loop ) {
  T current;
  T displaced_sum;
  current = 0;
  displaced_sum = 0;

  for ( int i = 0; i < loop; i++ ) {
    T old = current;
    current = static_cast< T >( i );
    displaced_sum += old;
  }

  return displaced_sum + current;
}

// Serial reference for ExchLoop, for T == Kokkos::complex<double> only.
//
// Overload selection: for any other T the parameter type below resolves to
// `void`, which makes this overload non-viable so that the generic overload
// is chosen instead.
//
// For i in [0, loop): the current value is replaced by the complex number
// ( i, 0 ) -- written through the real() / imag() accessors instead of a cast
// from int -- and the value it displaced is added to a running sum.
// Returns running sum + final value.
//
// Both values live in automatic storage; heap allocations for single scalars
// are unnecessary and would leak if an operation on T threw.
template< class T >
T ExchLoopSerial( typename std::conditional< std::is_same< T, Kokkos::complex<double> >::value, int, void >::type loop ) {
  T current;
  T displaced_sum;
  current = 0;
  displaced_sum = 0;

  for ( int i = 0; i < loop; i++ ) {
    T old = current;
    // Relies on real() / imag() returning assignable references.
    current.real() = ( static_cast<double>( i ) );
    current.imag() = 0;
    displaced_sum += old;
  }

  return displaced_sum + current;
}

// Dispatches to the device-side test selected by `test`:
//   1 -> AddLoop, 2 -> CASLoop, 3 -> ExchLoop.
// Any other selector yields a T converted from 0.
template< class T, class DeviceType >
T LoopVariant( int loop, int test ) {
  if ( test == 1 ) {
    return AddLoop< T, DeviceType >( loop );
  }
  if ( test == 2 ) {
    return CASLoop< T, DeviceType >( loop );
  }
  if ( test == 3 ) {
    return ExchLoop< T, DeviceType >( loop );
  }

  return 0;
}

// Dispatches to the serial reference implementation selected by `test`:
//   1 -> AddLoopSerial, 2 -> CASLoopSerial, 3 -> ExchLoopSerial.
// Any other selector yields a T converted from 0.
template< class T >
T LoopVariantSerial( int loop, int test ) {
  if ( test == 1 ) {
    return AddLoopSerial< T >( loop );
  }
  if ( test == 2 ) {
    return CASLoopSerial< T >( loop );
  }
  if ( test == 3 ) {
    return ExchLoopSerial< T >( loop );
  }

  return 0;
}

// Runs test number `test` with `loop` iterations on DeviceType and through the
// serial reference, and compares the two results with operator!=.
//
// Returns true when the results agree. On a mismatch, prints a diagnostic
// naming T, the test number and both values to std::cout and returns false.
template< class T, class DeviceType >
bool Loop( int loop, int test )
{
  // Kept non-const: operator!= of the tested types is not required to be
  // const-qualified.
  T device_result = LoopVariant< T, DeviceType >( loop, test );
  T serial_result = LoopVariantSerial< T >( loop, test );

  if ( !( serial_result != device_result ) ) {
    return true;
  }

  std::cout << "Loop<"
            << typeid( T ).name()
            << ">( test = "
            << test << " FAILED : "
            << serial_result << " != " << device_result
            << std::endl;

  return false;
}

} // namespace TestAtomic

namespace Test {

// Checks, for a range of element types, that the atomic operations executed
// on TEST_EXECSPACE produce the same result as the serial reference code.
//
// The second argument of TestAtomic::Loop selects the operation (see
// TestAtomic::LoopVariant):
//   1 = atomic_fetch_add, 2 = atomic_compare_exchange, 3 = atomic_exchange.
// The first argument is the number of work items / iterations.
TEST_F( TEST_CATEGORY, atomics )
{
  // Iteration count used for the integer and double cases.
  const int loop_count = 1e4;

  ASSERT_TRUE( ( TestAtomic::Loop< int, TEST_EXECSPACE >( loop_count, 1 ) ) );
  ASSERT_TRUE( ( TestAtomic::Loop< int, TEST_EXECSPACE >( loop_count, 2 ) ) );
  ASSERT_TRUE( ( TestAtomic::Loop< int, TEST_EXECSPACE >( loop_count, 3 ) ) );

  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, TEST_EXECSPACE >( loop_count, 1 ) ) );
  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, TEST_EXECSPACE >( loop_count, 2 ) ) );
  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, TEST_EXECSPACE >( loop_count, 3 ) ) );

  ASSERT_TRUE( ( TestAtomic::Loop< long int, TEST_EXECSPACE >( loop_count, 1 ) ) );
  ASSERT_TRUE( ( TestAtomic::Loop< long int, TEST_EXECSPACE >( loop_count, 2 ) ) );
  ASSERT_TRUE( ( TestAtomic::Loop< long int, TEST_EXECSPACE >( loop_count, 3 ) ) );

  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, TEST_EXECSPACE >( loop_count, 1 ) ) );
  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, TEST_EXECSPACE >( loop_count, 2 ) ) );
  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, TEST_EXECSPACE >( loop_count, 3 ) ) );

  ASSERT_TRUE( ( TestAtomic::Loop< long long int, TEST_EXECSPACE >( loop_count, 1 ) ) );
  ASSERT_TRUE( ( TestAtomic::Loop< long long int, TEST_EXECSPACE >( loop_count, 2 ) ) );
  ASSERT_TRUE( ( TestAtomic::Loop< long long int, TEST_EXECSPACE >( loop_count, 3 ) ) );

  ASSERT_TRUE( ( TestAtomic::Loop< double, TEST_EXECSPACE >( loop_count, 1 ) ) );
  ASSERT_TRUE( ( TestAtomic::Loop< double, TEST_EXECSPACE >( loop_count, 2 ) ) );
  ASSERT_TRUE( ( TestAtomic::Loop< double, TEST_EXECSPACE >( loop_count, 3 ) ) );

  // float runs only 100 iterations.
  // NOTE(review): presumably so that all intermediate sums stay exactly
  // representable in float regardless of accumulation order -- confirm.
  ASSERT_TRUE( ( TestAtomic::Loop< float, TEST_EXECSPACE >( 100, 1 ) ) );
  ASSERT_TRUE( ( TestAtomic::Loop< float, TEST_EXECSPACE >( 100, 2 ) ) );
  ASSERT_TRUE( ( TestAtomic::Loop< float, TEST_EXECSPACE >( 100, 3 ) ) );

  // The remaining cases use element types wider than 64 bits; they are
  // compiled out when either KOKKOS_ENABLE_OPENMPTARGET or KOKKOS_ENABLE_ROCM
  // is defined.
#ifndef KOKKOS_ENABLE_OPENMPTARGET
#ifndef KOKKOS_ENABLE_ROCM // ROCM doesn't yet support atomics for >64bit types
  // complex<double> with a single iteration ...
  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, TEST_EXECSPACE >( 1, 1 ) ) );
  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, TEST_EXECSPACE >( 1, 2 ) ) );
  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, TEST_EXECSPACE >( 1, 3 ) ) );

  // ... and with 100 concurrent iterations.
  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, TEST_EXECSPACE >( 100, 1 ) ) );
  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, TEST_EXECSPACE >( 100, 2 ) ) );
  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, TEST_EXECSPACE >( 100, 3 ) ) );

  // User-defined type holding four doubles.
  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, TEST_EXECSPACE >( 100, 1 ) ) );
  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, TEST_EXECSPACE >( 100, 2 ) ) );
  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, TEST_EXECSPACE >( 100, 3 ) ) );
#endif
#endif
}


} // namespace Test

